In [400]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# For getting the colour map
import matplotlib.cm as cm

# For clustering and creating a dendrogram
from scipy.cluster.hierarchy import dendrogram, linkage

# For labels of the dendrogram plot
import pylab

# for scatter matrix of numerical variables
from pandas.plotting import scatter_matrix

#Import for running a t test
from scipy.stats import ttest_ind

# Import statsmodels for Logistic Regression for statistical inference
import statsmodels.formula.api as smf

#Import DateTime to manipulate datetime columns
import datetime

# Import Plotly for graphs
import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

README

This Exploratory Data Analysis Assignment is done in Six sections.

Section -1 Common Functions:

In this section we define the common functions that we use for analysis of different columns.

Section -2 Preprocessing:

In this section we analyse the columns for null and single values and drop the columns
We also do pre processing of some columns like zipcode, etc.

Section 3:

This section is divided into the following subsections. We do the following for each column:

Data Analysis

percentiles, null values, outliers
Data Distribution - histogram, bar chart. box chart.

Data Cleanup.

Data Insights

Univariate Analysis
Segmented univariate analysis

t-test (To conclude after the advanced statistics course)

Section 4:

Correlation analysis, Bivariate Analysis, Derived Variables.

Section 5:

Logistic Regression for statistical Inference

Section 6:

Most Affected Columns & Univariate Analysis

----------------- Section:1 Common Functions ------------------------

In [401]:
# Helper function to do analysis of a numeric variable.
# Prints decile percentiles and the null count, then draws a histogram,
# a box plot, and a box plot segmented by loan status.
def do_analysis(dataframe, column_name):
    """Summarise a numeric column: percentiles, null count, histogram and box plots.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain `column_name` and a 'loan_status' column.
    column_name : str
        Numeric column to analyse.
    """
    print(dataframe[column_name].describe(percentiles=np.linspace(0,1,11)))
    print("Number of null values for the column {0} : {1}".format(column_name, dataframe[column_name].isnull().sum()))
    plt.figure(figsize=(10,10))
    # Fix: the original reserved a 3-row grid (311/312) but only drew two rows,
    # leaving the bottom third empty; use a 2-row grid instead.
    plt.subplot(211)
    dataframe[column_name].plot(kind='hist', bins=50)
    plt.subplot(212)
    dataframe[column_name].plot(kind='box')
    # NOTE: boxplot(by=...) opens its own new figure, separate from the grid above.
    dataframe[[column_name, 'loan_status']].boxplot(by='loan_status')
In [402]:
# Helper function to print the analysis of categorical variables.
# Gets the value counts for the different values, generates a pivot table by
# loan status, and draws a stacked bar chart by loan status.
def do_analysis_categorical(dataframe, column_name):
    """Segmented univariate analysis of a categorical column against loan_status.

    Prints value counts and a pivot table with paid / charged-off percentages,
    then plots a stacked bar chart of loan counts per category.

    Requires 'policy_code' (constant per row, so summing it counts rows) and
    'loan_status' (with 'Fully Paid' present) columns in `dataframe`.
    """
    print(dataframe[column_name].value_counts())
    # Segmented Univariate Analysis.
    # aggfunc='sum' (string) avoids the FutureWarning that passing np.sum
    # raises in recent pandas; the aggregation result is identical.
    pivot = pd.pivot_table(dataframe, values = 'policy_code',
                         index = [column_name], columns = ['loan_status'], aggfunc = 'sum', margins=True)
    # Compute the paid fraction once, then derive both percentage columns.
    paid_fraction = pivot['Fully Paid']/pivot['All']
    pivot['Paid Percentage'] = paid_fraction * 100
    pivot['Charged Off Percentage'] = (1 - paid_fraction) * 100
    print(pivot)
    pd.pivot_table(dataframe, values = 'policy_code', index = [column_name],
               columns = ['loan_status'], aggfunc = 'sum').plot(kind='bar', stacked=True, figsize=(24, 8))
In [403]:
# Helper function to run an independent two-sample t-test between the
# charged-off and fully-paid populations for a numeric column.
def run_t_test(dataframe, column_name):
    """Print the scipy t-test result comparing `column_name` values of
    'Charged Off' rows against 'Fully Paid' rows."""
    status = dataframe['loan_status']
    defaulted_values = dataframe.loc[status == 'Charged Off', column_name]
    repaid_values = dataframe.loc[status == 'Fully Paid', column_name]
    print(ttest_ind(defaulted_values, repaid_values))
    
In [404]:
# All input dates are in MON-YY format, e.g. "Dec-11".
# This function converts the date into a sortable integer of the form YYYYMM
# (e.g. "Dec-11" -> 201112); nulls map to 0.
def changeDate2SequenceOfInt(inputDate):
    """Convert a 'MON-YY' date string to the integer YYYYMM; return 0 for nulls.

    Parameters
    ----------
    inputDate : str or NaN
        Date like "Dec-11"; strptime's %b month matching is case-insensitive,
        so "DEC-11" also parses.

    Returns
    -------
    int
        year * 100 + month (zero-padded month position), or 0 when null.
    """
    if pd.isnull(inputDate):
        return 0
    dt = datetime.datetime.strptime(inputDate, "%b-%y")
    # Arithmetic instead of the original string concatenation + int() re-parse:
    # produces the same YYYYMM value without temporary strings.
    return dt.year * 100 + dt.month
In [405]:
# Get the percentage of null values for the input column.
def checkNullPerForColumns(columnName, dataframe=None):
    """Print the null percentage (rounded to 2 decimals) of `columnName`.

    Parameters
    ----------
    columnName : str
        Column to inspect.
    dataframe : pd.DataFrame, optional
        Frame to inspect; defaults to the notebook-global `lending_club_master`
        so existing single-argument calls keep working.
    """
    if dataframe is None:
        dataframe = lending_club_master
    column = dataframe.loc[:, columnName]
    null_percent = round(100 * (column.isnull().sum() / len(column.index)), 2)
    print ("Null Percentage of the Column is: ==> ", null_percent)
In [406]:
# Plot a total-loan-amount bar plot and a count plot for the given column,
# both segmented by loan status.
def pltUniVariableAnalysis(column, df):
    """Draw two stacked subplots for `column`: total loan_amnt per category
    (hue = loan_status) on top, and record counts per category below."""
    amount_column = 'loan_amnt'
    status_column = 'loan_status'

    plt.figure(figsize=(10, 10))

    # Top panel: total loan amount per category, split by loan status.
    plt.subplot(2, 1, 1)
    ax_sum = sns.barplot(x=column, y=amount_column, hue=status_column,
                         data=df, estimator=np.sum)
    ax_sum.set_xticklabels(ax_sum.get_xticklabels(), rotation=90)

    # Bottom panel: number of records per category, split by loan status.
    plt.subplot(2, 1, 2)
    ax_count = sns.countplot(x=column, hue=status_column, data=df)
    ax_count.set_xticklabels(ax_count.get_xticklabels(), rotation=90)

    plt.show()
In [407]:
# Build forward and reverse lookup tables between a column's unique values
# and consecutive integer codes.
def createDict(columnName, dataFrame):
    """Return (value->code, code->value) dicts for the unique values of
    `columnName`, coded in order of first appearance starting at 0."""
    values = dataFrame.loc[:, columnName].unique()
    codes = np.arange(len(values))
    value_to_code = dict(zip(values, codes))
    code_to_value = dict(zip(codes, values))
    return value_to_code, code_to_value
In [408]:
# Plot categorical distributions based on loan_amnt with titles as parameters.
# Draws two bar plots to show the comparison by loan status.
def pltCategorical(columnName, title1, title2):
    """Draw two bar charts of total loan_amnt per `columnName` category:
    'Charged Off' totals (title1) on top and 'Fully Paid' totals (title2)
    below, each sorted descending.

    Reads the notebook-global `lending_club` dataframe.
    """
    X1 = lending_club.pivot_table(index=columnName, values = 'loan_amnt', columns='loan_status', aggfunc='sum').reset_index()
    plt.figure(figsize=(16,14))
    plt.subplot(2, 1, 1)
    # Fix: pass x/y as keywords — positional x/y arguments to seaborn's
    # barplot are deprecated in 0.12 and removed in later releases.
    p = sns.barplot(x=columnName, y='Charged Off',
                    data=X1.sort_values(by='Charged Off', ascending=False))
    p.set_xticklabels(p.get_xticklabels(), rotation=30)
    plt.title(title1)
    plt.subplot(2,1,2)
    q = sns.barplot(x=columnName, y='Fully Paid',
                    data=X1.sort_values(by='Fully Paid', ascending=False))
    q.set_xticklabels(q.get_xticklabels(), rotation=30)
    plt.title(title2)
    plt.show()
In [409]:
# Create a DataFrame that groups the mean lateFeePrecent per category,
# split by loan_status, with the charged-off minus fully-paid difference.
def createDF_LateFeePrecent_Categories(categoryColumn):
    """Return mean 'lateFeePrecent' per `categoryColumn` value, pivoted by
    loan_status and sorted by the charged-off vs fully-paid gap (descending).

    Reads the notebook-global `lending_club` dataframe.
    """
    by_category = lending_club.pivot_table(
        index=categoryColumn, values='lateFeePrecent',
        columns=['loan_status'], aggfunc='mean').reset_index()
    by_category['LateFeePercent_diff'] = by_category['Charged Off'] - by_category['Fully Paid']
    return by_category.sort_values(by='LateFeePercent_diff', ascending=False)

# Plot using the DataFrame output from the "createDF_LateFeePrecent_Categories" function.
def pltLateFeePercentOnCategies(categoryColumn, dataFrame):
    """Bar-plot the charged-off minus fully-paid late-fee percentage gap
    per `categoryColumn` value, largest gap first."""
    plt.figure(figsize=(15, 8))
    ordered = dataFrame.sort_values(by='LateFeePercent_diff', ascending=False)
    axis = sns.barplot(x=categoryColumn, y='LateFeePercent_diff', data=ordered)
    plt.ylabel('Charged-Off LateFee Percent On Fully Paid')
    axis.set_xticklabels(axis.get_xticklabels(), rotation=90, fontsize=12)
    plt.title("More LateFee Precent by ChargedOff Subscriber on FullyPaid Subscriber")

    plt.show()
In [410]:
# Percentile analysis: count plot of a column (ordered by frequency),
# annotating each bar with its share of all rows.
def percentileanalysis(column_name, hue):
    """Count-plot `column_name` from the notebook-global `lending_club`,
    optionally split by `hue`, annotating each bar with its percentage of
    the total row count (or the raw count when no hue is given).

    Parameters
    ----------
    column_name : str
        Column to count, categories ordered by descending frequency.
    hue : str or None
        Optional column to segment the bars by.
    """
    fig, ax = plt.subplots()
    width = 18
    fig.set_size_inches(width, 7)
    ax = sns.countplot(data=lending_club, x=column_name,
                       order=lending_club[column_name].value_counts().index, hue=hue)
    # Fix: the original tested len(pd.Series(hue).unique()) > 0, which is
    # always true when `hue` is a column-name string, making the raw-count
    # branch unreachable (and `del temp` was nested inside its loop).
    # Annotate with percentages when a hue is supplied, raw counts otherwise.
    if hue is not None:
        for p in ax.patches:
            ax.annotate('{:1.1f}%'.format((p.get_height()*100)/float(len(lending_club))),
                        (p.get_x()+0.05, p.get_height()+20))
    else:
        for p in ax.patches:
            ax.annotate(p.get_height(), (p.get_x()+0.32, p.get_height()+20))
In [411]:
# Helper function to analyse a numeric variable and draw interactive plotly
# graphs: histogram, stacked paid/defaulted histograms, box plots, distplots.
def do_analysis_with_plotly_graphs(dataframe, column_name, label):
    """Print percentile/null summaries for `column_name`, then render plotly
    charts in order: overall histogram, stacked histogram split by loan
    status, box plots (overall / paid / charged off), a distplot of the whole
    column, and a distplot of paid vs defaulted."""
    series = dataframe[column_name]
    print(series.describe(percentiles=np.linspace(0,1,11)))
    print("Number of null values for the column {0} : {1}".format(column_name, series.isnull().sum()))

    # Split the population by outcome (keep only the matching rows).
    fully_paid = dataframe.drop(dataframe[dataframe.loan_status != 'Fully Paid'].index)
    charged_off = dataframe.drop(dataframe[dataframe.loan_status != 'Charged Off'].index)

    # Overall histogram.
    py.offline.iplot([go.Histogram(x=series, name=label)])

    # Stacked histogram: paid vs defaulted.
    stacked_fig = go.Figure(
        data=[go.Histogram(x=fully_paid[column_name], name='Paid'),
              go.Histogram(x=charged_off[column_name], name='Defaulted')],
        layout=go.Layout(barmode='stack'))
    py.offline.iplot(stacked_fig)

    # Box plots: overall column, then each outcome segment.
    py.offline.iplot([go.Box(y=series, name=label),
                      go.Box(y=fully_paid[column_name], name='Paid Off'),
                      go.Box(y=charged_off[column_name], name='Charged Off')])

    # Distribution plot of the whole column, then paid vs defaulted.
    py.offline.iplot(ff.create_distplot([series], [label]))
    py.offline.iplot(ff.create_distplot(
        [fully_paid[column_name], charged_off[column_name]],
        ['Paid', 'Defaulted']))
In [412]:
# Helper function to plot the top category distribution in fully-paid and
# charged-off subscribers.
# Bar plot: each category's share of the total row count (all statuses).
# Line plot: the charged-off fraction within each category.
def categorical_Percentage_Distribution(columnName, title):
    """Print and plot, per `columnName` category, the share of all rows and
    the charged-off rate inside the category.

    Reads the notebook-global `lending_club` dataframe; 'policy_code' is
    used only as a per-row counter for groupby counts.
    """
    totalCount = lending_club[columnName].count()

    # Share of the whole population held by each category.
    overall = pd.DataFrame(lending_club.groupby(columnName).policy_code.agg(['count'])).reset_index()
    overall['GroupPercentage'] = overall['count'] / totalCount

    # Charged-off counts per category, merged back onto the overall counts.
    defaulted_rows = lending_club.loc[lending_club['loan_status'] == 'Charged Off']
    defaulted = defaulted_rows.groupby(columnName).policy_code.agg(['count']).reset_index()
    defaulted.rename(columns={'count': 'GroupwiseChargedOffCount'}, inplace=True)
    merged = pd.merge(overall, defaulted, on=columnName)
    merged['ChargedOffPercent'] = merged['GroupwiseChargedOffCount'] / merged['count']
    print(merged)

    plt.figure(figsize=(12, 8))
    ax = plt.subplot(111)
    bars = sns.barplot(x=columnName, y='GroupPercentage', data=merged)
    bars.set_xticklabels(bars.get_xticklabels(), rotation=90, fontsize=12)
    plt.ylabel('Percentage', fontsize=12)
    plt.xlabel(columnName, fontsize=12)
    plt.plot(merged['ChargedOffPercent'], 'r-')
    plt.title(title, fontsize=12)
    ax.legend(loc='upper center', bbox_to_anchor=(0.8, 1.00), shadow=True, ncol=2, fontsize=10)
    plt.show()

----------------- SECTION:2 PreProcessing ------------------------

1. Data Analysis

==> Read Loan.csv file

In [413]:
lending_club = pd.read_csv('loan.csv')
C:\Users\Ranjan\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2728: DtypeWarning:

Columns (47) have mixed types. Specify dtype option on import or set low_memory=False.

==> Make the default datatype string for the 47th column to avoid the mixed-type load warning

In [414]:
print("Column Name Which Failed to Load ==> ", lending_club.columns[47])
customDataType = dict(zip(lending_club.columns,lending_club.dtypes)) #Get the Datatype
customDataType['next_pymnt_d'] = 'str'
lending_club = pd.read_csv('loan.csv',dtype=customDataType) #Reload
Column Name Which Failed to Load ==>  next_pymnt_d

==> Find all the Columns with 100% NULL and create a Dictionary

In [415]:
#Get Null% in each Column of loan dataframe
nullColumns = pd.DataFrame(round(100*(lending_club.isnull().sum()/len(lending_club.index)), 2), columns=['null%Column'])
#Create a Map Between Columns to Null% in DataFrame
dictColumn2NullPercent = dict(zip(nullColumns.index,nullColumns['null%Column']))
In [416]:
#Create the DataFrame with NullPercentage and BinaryCategory of Null and Not-Null
LendingClubCol_Df = pd.DataFrame(np.array(lending_club.columns), columns=['lending_club_columns'])
LendingClubCol_Df['NullPrecentage'] = LendingClubCol_Df['lending_club_columns'].apply(lambda x: dictColumn2NullPercent.get(x))
LendingClubCol_Df['NullPrecentageMoreThan95%'] = LendingClubCol_Df['NullPrecentage'].apply(lambda x: 'Null_Columns' if x > 95.0 else 'Non Null_Columns')
LendingClubCol_Df.head()
Out[416]:
lending_club_columns NullPrecentage NullPrecentageMoreThan95%
0 id 0.0 Non Null_Columns
1 member_id 0.0 Non Null_Columns
2 loan_amnt 0.0 Non Null_Columns
3 funded_amnt 0.0 Non Null_Columns
4 funded_amnt_inv 0.0 Non Null_Columns
In [417]:
#Plot Each Column with NUll %
plt.figure(figsize=(24,10))
plt.subplot(2,1,1)
p = sns.barplot(x = 'lending_club_columns', y = 'NullPrecentage', data = LendingClubCol_Df)
plt.xlabel('Null Percentage Of Columns', fontsize = 20)
plt.ylabel('Null %', fontsize= 20)
p.set_xticklabels(p.get_xticklabels(),rotation=90, fontsize = 15)
plt.title('Columns to Null%')
plt.show()
In [418]:
#Plot a Count Plot to see how many with maximum NULL value and how many columns can be considered for the analysis
plt.figure(figsize=(6, 15))

plt.subplot(2, 1, 1)
q = sns.countplot(x='NullPrecentageMoreThan95%',  data=LendingClubCol_Df)
q.set_xticklabels(q.get_xticklabels(),rotation=30)
plt.title('Null Column Analysis')
plt.show()
In [419]:
LendingClubCol_Df = LendingClubCol_Df.groupby('NullPrecentage').lending_club_columns.agg(['count']).sort_values(by='count', ascending=False).reset_index()
plt.figure(figsize=(18,6))
p = sns.barplot(x = 'NullPrecentage', y = 'count', data = LendingClubCol_Df)
plt.xlabel('Null Percentage Of Columns', fontsize = 12)
plt.ylabel('Count of Null %', fontsize= 12)
plt.title('Columns to Null Percentage')
plt.show()

==> Remove all the columns from the loan df having 100% null values (the filter below keeps columns with a null percentage below 100)

In [420]:
#np Vectorized Function to Filter Null Columns from the Loan Dataframe
Func_filterNullCol = np.vectorize(lambda x: True if dictColumn2NullPercent.get(x) < 100 else False)
Func_filterNullCol(np.array(lending_club.columns))
#Drop All the Null COlumns
lending_club = lending_club.loc[:,Func_filterNullCol(np.array(lending_club.columns))]
lending_club.head()
Out[420]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... next_pymnt_d last_credit_pull_d collections_12_mths_ex_med policy_code application_type acc_now_delinq chargeoff_within_12_mths delinq_amnt pub_rec_bankruptcies tax_liens
0 1077501 1296599 5000 5000 4975.0 36 months 10.65% 162.87 B B2 ... NaN May-16 0.0 1 INDIVIDUAL 0 0.0 0 0.0 0.0
1 1077430 1314167 2500 2500 2500.0 60 months 15.27% 59.83 C C4 ... NaN Sep-13 0.0 1 INDIVIDUAL 0 0.0 0 0.0 0.0
2 1077175 1313524 2400 2400 2400.0 36 months 15.96% 84.33 C C5 ... NaN May-16 0.0 1 INDIVIDUAL 0 0.0 0 0.0 0.0
3 1076863 1277178 10000 10000 10000.0 36 months 13.49% 339.31 C C1 ... NaN Apr-16 0.0 1 INDIVIDUAL 0 0.0 0 0.0 0.0
4 1075358 1311748 3000 3000 3000.0 60 months 12.69% 67.79 B B5 ... Jun-16 May-16 0.0 1 INDIVIDUAL 0 0.0 0 0.0 0.0

5 rows × 57 columns

==> Drop Columns with Single Unique Value as per Metadata Analysis

In [421]:
unncessaryColumns = ['desc', 'title', 'initial_list_status','application_type',
                     'acc_now_delinq', 'delinq_amnt', 'pymnt_plan', 'url']
lending_club.drop(columns=unncessaryColumns, inplace=True)
In [422]:
print("********Columns Considered for Analysis as follows:====>\n ", lending_club.columns)
print("********Total Number of Columns Considered:====> ", len(lending_club.columns))
********Columns Considered for Analysis as follows:====>
  Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'purpose', 'zip_code', 'addr_state', 'dti',
       'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'policy_code', 'chargeoff_within_12_mths', 'pub_rec_bankruptcies',
       'tax_liens'],
      dtype='object')
********Total Number of Columns Considered:====>  49

2. Basic Data Cleanup for All Columns

==> int_rate

In [423]:
lending_club['int_rate'] = round(lending_club['int_rate'].apply(lambda x: (str(x).split("%")[0])).apply(pd.to_numeric),0)

==> installment

In [424]:
lending_club['installment'] = round(lending_club['installment'].apply(pd.to_numeric),0)

==> emp_length

In [425]:
#Remove > and < Symbol and < 1 Value set to 0
lending_club['emp_length'] = lending_club['emp_length'].apply(lambda x: str(x).split("year")[0].\
                                                    strip().split("+")[0]).apply(lambda x: 0 if str(x).find("<") >=0 else x)

==> home_ownership

In [426]:
# Only 3 Records in NONE Category with Full Paid, Hence same record is discarded
lending_club = lending_club[lending_club['home_ownership']  != 'NONE']

==> issue_d, earliest_cr_line

In [427]:
#lending_club['issue_d'] = lending_club['issue_d'].apply(changeDate2SequenceOfInt)
#lending_club['earliest_cr_line'] = lending_club['earliest_cr_line'].apply(changeDate2SequenceOfInt)

==> zip_code

In [428]:
lending_club['zip_code'] = lending_club['zip_code'].apply(lambda x: str(x)[0:3])

==> next_pymnt_d, last_pymnt_d, last_credit_pull_d

In [429]:
lending_club['next_pymnt_d'] = lending_club['next_pymnt_d'].apply(changeDate2SequenceOfInt)
#lending_club['last_pymnt_d'] = lending_club['last_pymnt_d'].apply(changeDate2SequenceOfInt)
lending_club['last_credit_pull_d'] = lending_club['last_credit_pull_d'].apply(changeDate2SequenceOfInt)

==> Data Cleanup loan_status
Since the Current loan statuses cannot be used to determine whether a loan is paid or charged off,
We will remove the loan_status with the value Current. We will save it in a separate dataframe for future analysis

In [430]:
lending_club_master = lending_club
lending_club_current = lending_club.loc[lending_club['loan_status'] == 'Current']
lending_club = lending_club.loc[lending_club['loan_status'] != 'Current']
lending_club_ChargedOff = lending_club.loc[lending_club['loan_status'] == 'Charged Off']

----------------- SECTION:3 Univariate Analysis ------------------------

1. Data Analysis for loan_amnt

In [431]:
do_analysis(lending_club, 'loan_amnt')
count    38574.000000
mean     11047.449059
std       7348.517888
min        500.000000
0%         500.000000
10%       3100.000000
20%       5000.000000
30.0%     6000.000000
40%       7750.000000
50%       9600.000000
60.0%    11200.000000
70%      14000.000000
80%      16000.000000
90%      21600.000000
100%     35000.000000
max      35000.000000
Name: loan_amnt, dtype: float64
Number of null values for the column loan_amnt : 0

Insight.

The loan amount has spikes at round numbers like $5,000, $10,000, $15,000, $20,000, $25,000, etc.
This can be an input to the UX team or the Business Development team to create loan packages at these round numbers.

In [432]:
run_t_test(lending_club, 'loan_amnt')
Ttest_indResult(statistic=11.694718423883753, pvalue=1.533449120688028e-31)

Insight.

Visually, there seems to be only a slight difference in the means of the loan amount for charged off and fully paid.
However, the very small p-value of the t-test indicates that loan amount does have a bearing on loan defaults.

Data Cleanup for loan_amnt

Data Cleanup: We will not remove any row for loan_amnt as all the rows seem valid.

Insights for loan_amnt

In [433]:
lending_club[['annual_inc', 'loan_amnt']].corr()
Out[433]:
annual_inc loan_amnt
annual_inc 1.000000 0.269045
loan_amnt 0.269045 1.000000

Insight.

The loan amount is positively correlated with the annual income.

2. Data Analysis for funded_amnt

In [434]:
# Is funded_amnt same as loan amnt. 
#What % of rows where funded amount and loan amount are not same.
percent_difference = lending_club.loc[lending_club['loan_amnt'] != lending_club['funded_amnt']]['funded_amnt'].shape[0]/ lending_club.shape[0]
print("% .2f" % (percent_difference * 100))
 4.55
In [435]:
# Is the monetary difference more. yes
(lending_club['loan_amnt'] - lending_club['funded_amnt']).sum()
Out[435]:
10144475
In [436]:
do_analysis(lending_club, 'funded_amnt')
count    38574.000000
mean     10784.461684
std       7090.379936
min        500.000000
0%         500.000000
10%       3000.000000
20%       5000.000000
30.0%     6000.000000
40%       7500.000000
50%       9550.000000
60.0%    10875.000000
70%      13000.000000
80%      16000.000000
90%      20000.000000
100%     35000.000000
max      35000.000000
Name: funded_amnt, dtype: float64
Number of null values for the column funded_amnt : 0

Insight.

Charged off has a higher average funded amount than the fully paid segment
Also, since loan_amnt and funded_amnt are highly correlated and there is not a lot of difference between them, we can just use the funded_amnt column and drop the loan_amnt column.

Data Cleanup for funded_amnt

Insights for funded_amnt

In [437]:
# corelation between loan amount and funded amount
lending_club[['loan_amnt', 'funded_amnt']].corr()
Out[437]:
loan_amnt funded_amnt
loan_amnt 1.000000 0.981789
funded_amnt 0.981789 1.000000

3. Data Analysis for funded_amnt_inv

In [438]:
do_analysis(lending_club, 'funded_amnt_inv')
count    38574.000000
mean     10223.152285
std       7022.580869
min          0.000000
0%           0.000000
10%       2827.665000
20%       4375.162000
30.0%     5600.000000
40%       7000.000000
50%       8736.649151
60.0%    10000.000000
70%      12150.000000
80%      15000.000000
90%      19975.000000
100%     35000.000000
max      35000.000000
Name: funded_amnt_inv, dtype: float64
Number of null values for the column funded_amnt_inv : 0

4. Data Analysis for term

In [439]:
lending_club['term'].value_counts()
Out[439]:
 36 months    29093
 60 months     9481
Name: term, dtype: int64
In [440]:
do_analysis_categorical(lending_club, 'term')
 36 months    29093
 60 months     9481
Name: term, dtype: int64
loan_status  Charged Off  Fully Paid    All  Paid Percentage  \
term                                                           
 36 months          3227       25866  29093        88.907985   
 60 months          2400        7081   9481        74.686215   
All                 5627       32947  38574        85.412454   

loan_status  Charged Off Percentage  
term                                 
 36 months                11.092015  
 60 months                25.313785  
All                       14.587546  

Insight : term

11% of customers with tenure of 36 months default
25.31% of the customers with a tenure of 60 months default
Higher tenure leads to higher default rates.

5. Data Analysis for installment

In [441]:
lending_club['installment'].value_counts().head()
#lending_club['installment'].isnull().sum()
Out[441]:
156.0    250
311.0    194
187.0    172
249.0    154
313.0    151
Name: installment, dtype: int64
In [442]:
do_analysis(lending_club, 'installment')
count    38574.000000
mean       322.481490
std        208.637858
min         16.000000
0%          16.000000
10%         99.000000
20%        150.000000
30.0%      186.000000
40%        229.000000
50%        278.000000
60.0%      328.000000
70%        387.000000
80%        480.000000
90%        621.000000
100%      1305.000000
max       1305.000000
Name: installment, dtype: float64
Number of null values for the column installment : 0
In [443]:
# corelation between loan amount and funded amount
lending_club[['loan_amnt', 'installment']].corr()
Out[443]:
loan_amnt installment
loan_amnt 1.000000 0.932256
installment 0.932256 1.000000
In [444]:
# Categorize the installment amount into $200-wide bins to get better insights.
def categorizefunc(x):
    """Map an installment value to its bin label.

    Values in (0, 1000] fall into $200-wide bins; everything else
    (above 1000, non-positive, or NaN) maps to '1000>', matching the
    original if/elif chain's else branch.
    """
    bins = [(200, '0-200'), (400, '200-400'), (600, '400-600'),
            (800, '600-800'), (1000, '800-1000')]
    for upper_bound, label in bins:
        if 0 < x <= upper_bound:
            return label
    return '1000>'
                                                                          
lending_club['categorical_installment']=lending_club['installment'].apply(categorizefunc)
lending_club['categorical_installment']
do_analysis_categorical(lending_club, 'categorical_installment')
200-400     14708
0-200       13112
400-600      6548
600-800      2636
800-1000     1335
1000>         235
Name: categorical_installment, dtype: int64
loan_status              Charged Off  Fully Paid    All  Paid Percentage  \
categorical_installment                                                    
0-200                           1819       11293  13112        86.127212   
1000>                             25         210    235        89.361702   
200-400                         2061       12647  14708        85.987218   
400-600                         1044        5504   6548        84.056200   
600-800                          432        2204   2636        83.611533   
800-1000                         246        1089   1335        81.573034   
All                             5627       32947  38574        85.412454   

loan_status              Charged Off Percentage  
categorical_installment                          
0-200                                 13.872788  
1000>                                 10.638298  
200-400                               14.012782  
400-600                               15.943800  
600-800                               16.388467  
800-1000                              18.426966  
All                                   14.587546  

Data Cleanup for installment

We will not remove any row for installment as all the rows seem valid.

Insights for installment

correlation between installment and loan_amount is high. more number of customers pay installment around $180. The number of fully paid loans are with installments between 200 to 400

6. Data Analysis for grade

In [445]:
pltUniVariableAnalysis('grade',lending_club)
In [446]:
pltUniVariableAnalysis('grade',lending_club_ChargedOff)

Data Cleanup for grade

Insights for grade

  • Maximum Loan Provided to A, B, C and D Group Subscriber
  • Maximum loan provided to B,C and D Group "Charged Off" subscriber

7. Data Analysis for sub_grade

In [447]:
pltUniVariableAnalysis('sub_grade', lending_club.sort_values(by='sub_grade'))
In [448]:
pltUniVariableAnalysis('sub_grade', lending_club_ChargedOff.sort_values(by='sub_grade'))

Data Cleanup for sub_grade

Insights for sub_grade

  • Most of the Loans are Given in Upper Sub-Grades

8. Data Analysis for emp_length

In [449]:
pltUniVariableAnalysis('emp_length',lending_club)

Insights for emp_length

  • 10+ Experience people have taken more number of loans

9. Data Analysis for home_ownership

In [450]:
pltUniVariableAnalysis('home_ownership',lending_club_master)
In [451]:
pltUniVariableAnalysis('home_ownership', lending_club_ChargedOff)
In [452]:
columnName = 'home_ownership'
title1 = 'Loan Amount Distribution Based on Home Ownership for ChargedOff Subscriber'
title2 = 'Loan Amount Distribution Based on Home Ownership for Fully Paid Subscriber'
pltCategorical(columnName, title1, title2)

Data Cleanup for home_ownership

Insights for home_ownership

  • Most of the Defaulter Home Ownership status is "Mortgage" and "Rent".

10. Data Analysis for annual_inc

In [453]:
do_analysis(lending_club, 'annual_inc')
count    3.857400e+04
mean     6.877704e+04
std      6.422001e+04
min      4.000000e+03
0%       4.000000e+03
10%      3.000000e+04
20%      3.719808e+04
30.0%    4.450000e+04
40%      5.000400e+04
50%      5.886028e+04
60.0%    6.500400e+04
70%      7.500000e+04
80%      9.000000e+04
90%      1.150000e+05
100%     6.000000e+06
max      6.000000e+06
Name: annual_inc, dtype: float64
Number of null values for the column annual_inc : 0

Data Cleanup for annual_inc

In [454]:
# 5th percentile
lending_club.annual_inc.quantile(0.05)
Out[454]:
24000.0
In [455]:
# What percentage of rows fall below the 5th-percentile income (~24,000)?
percent = lending_club.loc[lending_club['annual_inc'] < 24000].shape[0] * 100 / lending_club.shape[0]
print("% .2f" % percent)
 4.62
In [456]:
# 99th percentile of annual income (the previous comment said "95th",
# but quantile(0.99) is the 99th percentile).
lending_club.annual_inc.quantile(0.99)
Out[456]:
234161.99999999808
In [457]:
# Percentage of values that fall above the 99th percentile (i.e. the top 1%).
# 234,999 approximates the 99th-percentile value computed above (~234,162).
percent = lending_club.loc[lending_club['annual_inc'] > 234999].shape[0] * 100/ lending_club.shape[0]
print("% .2f" % percent)
 1.00
In [458]:
# Five-number summary of annual income before outlier removal.
lending_club['annual_inc'].describe()
Out[458]:
count    3.857400e+04
mean     6.877704e+04
std      6.422001e+04
min      4.000000e+03
25%      4.000000e+04
50%      5.886028e+04
75%      8.200000e+04
max      6.000000e+06
Name: annual_inc, dtype: float64

The interquartile spread is between around 40,000 and 82,000. We will drop the top 1 percentile (values above the 99th percentile) as outliers. We will not remove the bottom 5th percentile because those borrowers can genuinely be from the low-income group; also, only about 5% of rows fall below it, so we keep them.

In [459]:
# Removing the top 1 percentile of outliers.
# NOTE(review): the strict '<' also drops rows exactly equal to 234,999 —
# confirm this boundary is intended.
lending_club = lending_club.loc[lending_club['annual_inc'] < 234999]

11. Data Analysis for issue_d

In [460]:
# Derive month/year columns from issue_d.
# Assumes issue_d strings are 'Mon-YY' (index 0 = month, 1 = 2-digit year) — TODO confirm.
# NOTE: the bare first line is a no-op (it is not the cell's last expression).
lending_club['issue_d']
lending_club['issue_month']=lending_club['issue_d'].apply(lambda x:x.split('-')[0])
lending_club['issue_year']=lending_club['issue_d'].apply(lambda x:x.split('-')[1])
lending_club['issue_year'] = lending_club['issue_year'].apply(pd.to_numeric)
In [461]:
# Paid vs Charged Off percentages broken down by issue year.
percentileanalysis('issue_year','loan_status')

Data Cleanup for issue_d

No clean up done for issue_d as there are no null values.

12. Data Analysis for loan_status

In [462]:
# Count of loans per loan_status (bar chart) on the unfiltered master frame.
lending_club_master.pivot_table(values='loan_amnt', index='loan_status', aggfunc='count').plot(kind='bar', stacked=True, figsize=(20, 8), fontsize=15)
Out[462]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2be8545c0>

Data Cleanup for loan_status

Insights for loan_status

  • Fully Paid loans have the highest count; the number of defaulters (Charged Off) is comparatively larger than the number of loans that are still running (Current)

13. Data Analysis for purpose

In [463]:
# Total charged-off loan amount per purpose, largest first.
lending_club_ChargedOff.pivot_table(values='loan_amnt', index='purpose', aggfunc='sum').sort_values(by='loan_amnt', ascending=False).plot(kind='bar', stacked=True, figsize=(20, 8), fontsize=15)
Out[463]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2b8133358>
In [464]:
# Compare loan-amount distribution by purpose for Charged Off vs Fully Paid.
columnName = 'purpose'
title1 = 'Loan Amount Distribution Based on "Purpose" of Loan, for ChargedOff Subscriber'
title2 = 'Loan Amount Distribution Based on "Purpose" of Loan, for Fully Paid Subscriber'
pltCategorical(columnName, title1, title2)

Data Cleanup for purpose

Insights for purpose

  • The major purposes of the loan (amount-wise) for Charged Off subscribers are "Debt Consolidation", "Credit Card" and "Small Business"
  • For FullPaid Subscriber, 3rd most loan amount category is "Home Improvement"

14. Data Analysis for addr_state

In [465]:
# Charged-off loan count per state; policy_code serves only as the column counted.
df1_addr_state = lending_club_ChargedOff.pivot_table(values='policy_code', index='addr_state', aggfunc='count').sort_values(by='policy_code', ascending=False)
df1_addr_state.plot(kind='bar', stacked=True, figsize=(20, 8), fontsize=15)
Out[465]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2b20d6cc0>
In [466]:
# Same per-state counts as a line plot, for an easier view of the drop-off.
df1_addr_state = df1_addr_state.reset_index()
plt.figure(figsize=(15,5))
plt.plot(df1_addr_state.addr_state, df1_addr_state.policy_code, 'o-')
plt.show()
In [467]:
# Compare loan-amount distribution by state for Charged Off vs Fully Paid.
columnName = 'addr_state'
title1 = 'Loan Amount Distribution Based on "addr_state" of Loan, for ChargedOff Subscriber'
title2 = 'Loan Amount Distribution Based on "addr_state" of Loan, for Fully Paid Subscriber'
pltCategorical(columnName, title1, title2)

Data Cleanup for addr_state

Insights for addr_state

  • Top States based on Loan_amount is different based on Loan_status
  • Overall Top 3 categories are:
    • CA
    • FL
    • NY
  • Fully Paid Category top3 are:
    • CA
    • NY
    • TX
  • Charged Off Categories Top3 Are:
    • CA
    • NY
    • FL

15. Data Analysis for dti

In [468]:
# Percentile/null analysis for dti (debt-to-income ratio).
do_analysis(lending_club, 'dti')
count    38190.000000
mean        13.333638
std          6.657797
min          0.000000
0%           0.000000
10%          4.130000
20%          7.020000
30.0%        9.330000
40%         11.420000
50%         13.430000
60.0%       15.380000
70%         17.490000
80%         19.750000
90%         22.310000
100%        29.990000
max         29.990000
Name: dti, dtype: float64
Number of null values for the column dti : 0

Data Cleanup for dti

* Since there are no null values, and no outliers for dti, there is no need to do a datacleanup

Insights for dti

  • Insight: The dti for charged off loans is higher than the fully paid loans

16. Data Analysis for earliest_cr_line

In [469]:
# Derive month/year from earliest_cr_line.
# Assumes 'Mon-YY' strings (index 0 = month, 1 = 2-digit year) — TODO confirm;
# the 2-digit years explain the 0..99 values analysed in the next cell.
# NOTE: the bare first line is a no-op.
lending_club['earliest_cr_line']
lending_club['earliestcrline_month']=lending_club['earliest_cr_line'].apply(lambda x:x.split('-')[0])
lending_club['earliestcrline_year']=lending_club['earliest_cr_line'].apply(lambda x:x.split('-')[1])
lending_club['earliestcrline_year'] = lending_club['earliestcrline_year'].apply(pd.to_numeric)
In [470]:
# Frequency and Paid/Charged-Off percentage tables per 2-digit credit-history year.
do_analysis_categorical(lending_club, 'earliestcrline_year')
0     3144
99    2882
98    2756
1     2449
97    2370
96    2178
95    2054
94    1911
2     1881
3     1807
4     1721
5     1465
93    1438
6     1154
92    1016
90     982
91     917
89     816
7      697
88     692
87     577
86     471
85     435
84     381
83     311
82     210
81     177
80     171
78     160
79     154
8      151
77     110
76      92
75      82
74      61
73      60
72      48
70      47
71      44
69      36
68      19
67      17
66      11
63      10
65       7
62       5
64       5
61       2
54       2
59       1
46       1
50       1
56       1
Name: earliestcrline_year, dtype: int64
loan_status          Charged Off  Fully Paid    All  Paid Percentage  \
earliestcrline_year                                                    
0                          496.0      2648.0   3144        84.223919   
1                          354.0      2095.0   2449        85.545120   
2                          254.0      1627.0   1881        86.496544   
3                          224.0      1583.0   1807        87.603763   
4                          228.0      1493.0   1721        86.751888   
5                          238.0      1227.0   1465        83.754266   
6                          224.0       930.0   1154        80.589255   
7                          144.0       553.0    697        79.340029   
8                           28.0       123.0    151        81.456954   
46                           NaN         1.0      1       100.000000   
50                           NaN         1.0      1       100.000000   
54                           NaN         2.0      2       100.000000   
56                           NaN         1.0      1       100.000000   
59                           NaN         1.0      1       100.000000   
61                           NaN         2.0      2       100.000000   
62                           2.0         3.0      5        60.000000   
63                           1.0         9.0     10        90.000000   
64                           NaN         5.0      5       100.000000   
65                           1.0         6.0      7        85.714286   
66                           2.0         9.0     11        81.818182   
67                           5.0        12.0     17        70.588235   
68                           5.0        14.0     19        73.684211   
69                           3.0        33.0     36        91.666667   
70                           7.0        40.0     47        85.106383   
71                           5.0        39.0     44        88.636364   
72                           7.0        41.0     48        85.416667   
73                          11.0        49.0     60        81.666667   
74                           5.0        56.0     61        91.803279   
75                           9.0        73.0     82        89.024390   
76                           4.0        88.0     92        95.652174   
77                          13.0        97.0    110        88.181818   
78                          23.0       137.0    160        85.625000   
79                          31.0       123.0    154        79.870130   
80                          16.0       155.0    171        90.643275   
81                          20.0       157.0    177        88.700565   
82                          32.0       178.0    210        84.761905   
83                          53.0       258.0    311        82.958199   
84                          46.0       335.0    381        87.926509   
85                          66.0       369.0    435        84.827586   
86                          53.0       418.0    471        88.747346   
87                          77.0       500.0    577        86.655113   
88                         105.0       587.0    692        84.826590   
89                         110.0       706.0    816        86.519608   
90                         125.0       857.0    982        87.270876   
91                         139.0       778.0    917        84.841876   
92                         133.0       883.0   1016        86.909449   
93                         193.0      1245.0   1438        86.578581   
94                         272.0      1639.0   1911        85.766614   
95                         301.0      1753.0   2054        85.345667   
96                         298.0      1880.0   2178        86.317723   
97                         371.0      1999.0   2370        84.345992   
98                         411.0      2345.0   2756        85.087083   
99                         439.0      2443.0   2882        84.767523   
All                       5584.0     32606.0  38190        85.378371   

loan_status          Charged Off Percentage  
earliestcrline_year                          
0                                 15.776081  
1                                 14.454880  
2                                 13.503456  
3                                 12.396237  
4                                 13.248112  
5                                 16.245734  
6                                 19.410745  
7                                 20.659971  
8                                 18.543046  
46                                 0.000000  
50                                 0.000000  
54                                 0.000000  
56                                 0.000000  
59                                 0.000000  
61                                 0.000000  
62                                40.000000  
63                                10.000000  
64                                 0.000000  
65                                14.285714  
66                                18.181818  
67                                29.411765  
68                                26.315789  
69                                 8.333333  
70                                14.893617  
71                                11.363636  
72                                14.583333  
73                                18.333333  
74                                 8.196721  
75                                10.975610  
76                                 4.347826  
77                                11.818182  
78                                14.375000  
79                                20.129870  
80                                 9.356725  
81                                11.299435  
82                                15.238095  
83                                17.041801  
84                                12.073491  
85                                15.172414  
86                                11.252654  
87                                13.344887  
88                                15.173410  
89                                13.480392  
90                                12.729124  
91                                15.158124  
92                                13.090551  
93                                13.421419  
94                                14.233386  
95                                14.654333  
96                                13.682277  
97                                15.654008  
98                                14.912917  
99                                15.232477  
All                               14.621629  

Data Cleanup for earliest_cr_line

Since there are no null values no cleanup is needed.

Insights for earliest_cr_line

The number of loans has been increasing over the years, and the banks' business can be predicted to keep increasing in coming years in a similar fashion.

17. Data Analysis for inq_last_6mths

In [471]:
# Charged-off loan count by number of credit inquiries in the last 6 months;
# policy_code serves only as the column counted.
df1_inq_last_6mths = lending_club_ChargedOff.pivot_table(values='policy_code', index='inq_last_6mths', aggfunc='count').reset_index()
plt.figure(figsize=(15,5))
plt.plot(df1_inq_last_6mths.inq_last_6mths, df1_inq_last_6mths.policy_code, '-o')
plt.show()

Data Cleanup for inq_last_6mths

Insights for inq_last_6mths

* The number of inquiries is mostly between 0 and 4 over the last 6-month period

18. Data Analysis for mths_since_last_delinq

In [472]:
# Percentile/null analysis; the 24,675 nulls reported below are handled in the
# cleanup cell that follows.
do_analysis(lending_club, 'mths_since_last_delinq')
count    13515.000000
mean        35.935627
std         22.029297
min          0.000000
0%           0.000000
10%          8.000000
20%         15.000000
30.0%       21.000000
40%         27.000000
50%         34.000000
60.0%       40.000000
70%         47.000000
80%         57.000000
90%         69.000000
100%       120.000000
max        120.000000
Name: mths_since_last_delinq, dtype: float64
Number of null values for the column mths_since_last_delinq : 24675

Data Cleanup for mths_since_last_delinq

In [473]:
# There are 24675 null values. The null values mean that the customer has been
# repaying the loan on time; we impute these nulls to 0.
# NOTE(review): 0 is also a legitimate observed value ("delinquent this month"),
# so the imputation conflates the two groups — verify downstream usage.
lending_club.loc[lending_club['mths_since_last_delinq'].isnull(),'mths_since_last_delinq'] = 0
In [474]:
# Histogram of mths_since_last_delinq, excluding the (imputed) 0 values.
lending_club.loc[lending_club['mths_since_last_delinq'] != 0]['mths_since_last_delinq'].plot(kind='hist', bins=50)
Out[474]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2b480c780>
In [475]:
# Box plot by loan status, again excluding the imputed zeros.
lending_club.loc[lending_club['mths_since_last_delinq'] != 0][['mths_since_last_delinq', 'loan_status']].boxplot(by='loan_status')
Out[475]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2bd783be0>

Insights for mths_since_last_delinq

* Charged off loans have a higher number of months since the last delinquency

19. Data Analysis for open_acc

In [476]:
#do_analysis(lending_club, 'open_acc')
# Histogram of the number of open credit lines per borrower.
lending_club['open_acc'].plot(kind='hist')
Out[476]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2b8128908>
In [477]:
# DERIVED variable: percentage of this borrower's total accounts that are open.
lending_club['Ratio_Open_Total_acc']=(lending_club['open_acc']/lending_club['total_acc'])*100
#lending_club['Ration_Open_Total_acc']
# NOTE(review): the max of 175% below implies open_acc > total_acc for some rows —
# worth confirming against the data dictionary.
do_analysis(lending_club, 'Ratio_Open_Total_acc')
count    38190.000000
mean        47.162328
std         18.686415
min          5.128205
0%           5.128205
10%         25.000000
20%         30.952381
30.0%       35.555556
40%         40.000000
50%         44.444444
60.0%       50.000000
70%         55.000000
80%         62.500000
90%         73.333333
100%       175.000000
max        175.000000
Name: Ratio_Open_Total_acc, dtype: float64
Number of null values for the column Ratio_Open_Total_acc : 0

Data Cleanup for open_acc

No Data Clean up done.

Insights for open_acc

1. Profiles with around 10 open accounts are the most frequent.
2. There are outliers with more than 20 open accounts.
3. Out of 3000 profiles, around 800 have an open_acc/total_acc ratio of 100%, i.e. all of their accounts are either Current or Charged Off.
4. These profiles should never be considered for the next loan request; cases at 75% and above also need to be checked.

20. Data Analysis for pub_rec

In [478]:
# pub_rec (derogatory public records) distribution on the master frame.
#lending_club_master.pivot_table(values='policy_code', index='pub_rec', aggfunc='count', columns=['loan_status']).plot(kind='bar', stacked=True, figsize=(20, 8), fontsize=15)
pltUniVariableAnalysis('pub_rec', lending_club_master)
In [479]:
# pub_rec distribution for Charged Off loans only.
pltUniVariableAnalysis('pub_rec', lending_club_ChargedOff)

Insights for pub_rec

  • Most of the applicant attitude was fine. Maximum are under 0 complain section. Defaulters are only in the category of no complain or 1 complain zone.

21. Data Analysis for revol_bal

In [480]:
# Percentile/null analysis of total revolving credit balance.
do_analysis(lending_club, 'revol_bal')
count     38190.000000
mean      13077.999869
std       15427.815017
min           0.000000
0%            0.000000
10%        1092.000000
20%        2770.800000
30.0%      4541.700000
40%        6487.000000
50%        8705.500000
60.0%     11316.000000
70%       14605.300000
80%       19397.800000
90%       28541.000000
100%     149588.000000
max      149588.000000
Name: revol_bal, dtype: float64
Number of null values for the column revol_bal : 0

Data Cleanup for revol_bal

In [481]:
# 90% of the revol_bal values are under 28, 541 dollars. The max loan amount is around 35, 000 dollars.
# It is likely that the values above 80, 000 are outliers, and we can remove them.
# Lets look at the percentage of rows where revol_bal is above 80,000
lending_club.loc[lending_club['revol_bal'] > 80000].shape[0] * 100 / lending_club.shape[0]
Out[481]:
1.0002618486514794
In [482]:
# To decide if we can drop these rows, check whether there is a major correlation
# between revol_bal and charged-off status for values over 80,000
# (the previous comment said 60,000, but the filter below uses 80,000).
lending_club.loc[lending_club['revol_bal'] > 80000][['revol_bal', 'loan_status']].boxplot(by='loan_status')
Out[482]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2b806b470>
In [483]:
# Since there is a visible corelation between revolving balance and loan status, we cannot remove the values above 80,000
# The values above 80, 000 seem to be valid values.
In [484]:
# Median revolving balance per loan status.
lending_club[['revol_bal', 'loan_status']].groupby('loan_status').quantile([0.5])
Out[484]:
revol_bal
loan_status
Charged Off 0.5 9176.5
Fully Paid 0.5 8631.5

Insights for revol_bal

* The median revolving balance for charged off loans is 6.3% higher than the fully paid loans.

In [485]:
# Is revol_bal and revol_util corelated. They are not highly corelated.
lending_club[['revol_bal','revol_util']].corr()
Out[485]:
revol_bal
revol_bal 1.0

22. Data Analysis for total_acc

In [486]:
# Percentile/null analysis of total credit accounts.
# NOTE: the bare first line is a no-op; do_analysis prints the summary.
lending_club['total_acc']
do_analysis(lending_club, 'total_acc')
count    38190.000000
mean        21.974836
std         11.383505
min          2.000000
0%           2.000000
10%          9.000000
20%         12.000000
30.0%       15.000000
40%         17.000000
50%         20.000000
60.0%       23.000000
70%         27.000000
80%         31.000000
90%         37.000000
100%        90.000000
max         90.000000
Name: total_acc, dtype: float64
Number of null values for the column total_acc : 0

Data Cleanup for total_acc

no data clean up done as no null values or incorrect values

Insights for total_acc

There are around 2800 profiles with total accounts around 18 All profiles with charged off/current/Fully paid status have 20 total accounts on an average

23. Data Analysis for out_prncp_inv

In [487]:
# NOTE: the value_counts() result is discarded (not the last expression).
lending_club['out_prncp_inv'].value_counts()
do_analysis(lending_club,'out_prncp_inv')
count    38190.0
mean         0.0
std          0.0
min          0.0
0%           0.0
10%          0.0
20%          0.0
30.0%        0.0
40%          0.0
50%          0.0
60.0%        0.0
70%          0.0
80%          0.0
90%          0.0
100%         0.0
max          0.0
Name: out_prncp_inv, dtype: float64
Number of null values for the column out_prncp_inv : 0

Data Cleanup for out_prncp_inv

In [488]:
# Since most of the outstanding principal is 0, filter it out for better insights.
# NOTE(review): the summary above shows out_prncp_inv == 0 for every row of
# lending_club, so this filtered frame is empty — confirm before relying on it.
lending_club_temp=lending_club.loc[lending_club['out_prncp_inv'] > 0]
#lending_club_temp['out_prncp_inv']
#do_analysis(lending_club_temp,'out_prncp_inv')

Insights for out_prncp_inv

Most of the profiles have around 600 dollars as the outstanding principal (out_prncp_inv), while the next peak is at around 1200 dollars. The highest outstanding principal is around 6300 dollars.

24. Data Analysis for total_pymnt

In [489]:
# Record counts (on the master frame) and percentile analysis of total payment
# received (on the cleaned frame).
print("Total Number of Records: ", lending_club_master['total_pymnt'].shape[0])
print("Total Unique Counts of the Records: ", len(lending_club_master['total_pymnt'].unique()))
do_analysis(lending_club, 'total_pymnt')
Total Number of Records:  39714
Total Unique Counts of the Records:  37847
count    38190.000000
mean     11767.212967
std       8696.546176
min          0.000000
0%           0.000000
10%       2903.992589
20%       4671.019472
30.0%     6093.307181
40%       7673.605898
50%       9614.195000
60.0%    11678.646520
70%      14243.160989
80%      17785.408670
90%      23882.842285
100%     58563.679930
max      58563.679930
Name: total_pymnt, dtype: float64
Number of null values for the column total_pymnt : 0
In [490]:
def splitTotalPayment(x):
    """Bucket a total-payment amount: Low (<= 5000), Medium (5000, 15000], High (> 15000)."""
    if x > 15000:
        return "High"
    if x > 5000:
        return "Medium"
    return "Low"
# Bucket total_pymnt into Low/Medium/High on the master frame, plot the
# distribution, then drop the temporary column so the frame is left unchanged.
lending_club_master['Temp_Total_Payment'] = lending_club_master['total_pymnt'].apply(splitTotalPayment)
pltUniVariableAnalysis('Temp_Total_Payment', lending_club_master)
lending_club_master.drop(columns='Temp_Total_Payment', inplace=True)

Insights for total_pymnt

  • The maximum total_payment count falls within the range of 5K to 15K
  • The average total payment is higher for Fully Paid subscribers than for Charged Off subscribers

25. Data Analysis for total_rec_int

In [491]:
#lending_club['total_rec_int'].value_counts()
# Percentile/null analysis of total interest received to date.
do_analysis(lending_club,'total_rec_int')
count    38190.000000
mean      2098.274761
std       2355.126239
min          0.000000
0%           0.000000
10%        308.655000
20%        528.794000
30.0%      751.287000
40%        994.722000
50%       1291.920000
60.0%     1692.964000
70%       2265.927000
80%       3151.934000
90%       4841.713000
100%     23563.680000
max      23563.680000
Name: total_rec_int, dtype: float64
Number of null values for the column total_rec_int : 0

Data Cleanup for total_rec_int

Though there are outliers , removing them will not have a significant impact hence retaining them.

Insights for total_rec_int

A high number of profiles (around 6500) have paid 0.0 dollars in interest: the reason has to be found and such profiles should be handled separately. A few profiles have paid around 23,000 dollars in interest: these should be checked against their principal.

26. Data Analysis for total_rec_late_fee

In [492]:
# Late-fee analysis: null check, counts, and the share of Charged Off borrowers
# who incurred at least one late fee, plus a count/sum breakdown by status.
checkNullPerForColumns('total_rec_late_fee')
totalChargedOffSub = lending_club_ChargedOff['total_rec_late_fee'].shape[0]
positiveLatePayment = len(lending_club_ChargedOff[lending_club_ChargedOff['total_rec_late_fee'] > 0])
print("Total Number of Subscriber Records: ==>", totalChargedOffSub)
print("Total Unique Counts of the Records: ==>", len(lending_club_ChargedOff['total_rec_late_fee'].unique()))
print("Total Number of +ve Late Payment, ChargedOff Subscriber: ==>",positiveLatePayment)
print("% of ChargedOff Subscriber Done Late Payment: ==>",  round((positiveLatePayment/totalChargedOffSub)*100, 0))
print(lending_club_master.groupby(['loan_status'])['total_rec_late_fee'].agg(['count', 'sum']))
Null Percentage of the Column is: ==>  0.0
Total Number of Subscriber Records: ==> 5627
Total Unique Counts of the Records: ==> 829
Total Number of +ve Late Payment, ChargedOff Subscriber: ==> 863
% of ChargedOff Subscriber Done Late Payment: ==> 15.0
             count           sum
loan_status                     
Charged Off   5627  24149.351283
Current       1140   1337.080719
Fully Paid   32947  28648.443189

Data Cleanup for total_rec_late_fee

Insights for total_rec_late_fee

  • The maximum late-payment count is under the "Fully Paid" section
  • 15% of the Charged Off subscribers made late payments
  • There is no direct impact on Charged Off status by itself; this field needs to be used along with the other money-related fields.

27. Data Analysis for recoveries

In [493]:
# Percentile/null analysis of post-charge-off recovery amounts.
do_analysis(lending_club, 'recoveries')
count    38190.000000
mean        97.606055
std        695.965913
min          0.000000
0%           0.000000
10%          0.000000
20%          0.000000
30.0%        0.000000
40%          0.000000
50%          0.000000
60.0%        0.000000
70%          0.000000
80%          0.000000
90%         16.383000
100%     29623.350000
max      29623.350000
Name: recoveries, dtype: float64
Number of null values for the column recoveries : 0
In [494]:
# Recovery is non-zero only for charged-off loans, so analyse the non-zero subset.
lending_club.loc[lending_club['recoveries'] > 0]['recoveries'].plot(kind='hist')
Out[494]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2bd78b630>

Data Cleanup for recoveries

In [495]:
# There are no null values for recoveries. Therefore there is no need for data cleanup.
# Recoveries is 0 for fully paid loans

Insights for recoveries

In [496]:
# Median recovery per loan status (0 for Fully Paid per the output below).
lending_club[['recoveries', 'loan_status']].groupby('loan_status').quantile([.5])
Out[496]:
recoveries
loan_status
Charged Off 0.5 172.965
Fully Paid 0.5 0.000

  • Recoveries are 0 for Fully Paid loans, and the median recovery is about 172 for Charged Off loans
  • Since recovery happens after a loan defaults, this cannot be an independent variable when it comes to predicting loan status

28. Data Analysis for last_pymnt_d

In [497]:
# Derive the month token from last_pymnt_d (assumes 'Mon-YY' strings — TODO confirm).
# The astype(str) guards the .split call; presumably the column contains NaN
# floats for loans with no payment — verify against the raw data.
lending_club['last_pymnt_d']
lending_club['last_pymnt_d']=lending_club['last_pymnt_d'].astype(str)
lending_club['last_pymnt_month']=lending_club['last_pymnt_d'].apply(lambda x:x.split('-')[0])
#lending_club['last_pymnt_year']=lending_club['last_pymnt_d'].apply(lambda x:x.split('-')[1] if x.find("-") else 0)
#lending_club['last_pymnt_year'].apply(pd.to_numeric)
#lending_club['last_pymnt_month']
#lending_club['last_pymnt_d']
#lending_club['last_pymnt_year']
#lending_club['last_pymnt_year']=lending_club['last_pymnt_d'].apply(lambda x:x.split('-')[1])

29. Data Analysis for last_pymnt_amnt

In [498]:
# last_pymnt_amnt analysis for Charged Off borrowers.
# NOTE(review): the third print's label says "Late Payment" but the count is of
# positive LAST payments — the label appears to be a copy-paste from section 26.
checkNullPerForColumns('last_pymnt_amnt')
totalChargedOffSub = lending_club_ChargedOff['last_pymnt_amnt'].shape[0]
positiveLastPaymnt = len(lending_club_ChargedOff[lending_club_ChargedOff['last_pymnt_amnt'] > 0])
print("Total Number of Subscriber Records: ==>", totalChargedOffSub)
print("Total Unique Counts of the Records: ==>", len(lending_club_ChargedOff['last_pymnt_amnt'].unique()))
print("Total Number of +ve Late Payment, ChargedOff Subscriber: ==>",positiveLastPaymnt)

do_analysis(lending_club_ChargedOff, 'last_pymnt_amnt')
Null Percentage of the Column is: ==>  0.0
Total Number of Subscriber Records: ==> 5627
Total Unique Counts of the Records: ==> 4591
Total Number of +ve Late Payment, ChargedOff Subscriber: ==> 5556
count     5627.000000
mean       326.020833
std        528.983051
min          0.000000
0%           0.000000
10%         43.960000
20%         91.928000
30.0%      138.020000
40%        186.004000
50%        238.250000
60.0%      300.636000
70%        367.050000
80%        471.584000
90%        648.600000
100%     12818.380000
max      12818.380000
Name: last_pymnt_amnt, dtype: float64
Number of null values for the column last_pymnt_amnt : 0
In [499]:
def splitLatePayment(x):
    """Bucket a payment amount: Low (<= 5000), Medium (5000, 15000], High (> 15000)."""
    for limit, label in ((5000, "Low"), (15000, "Medium")):
        if x <= limit:
            return label
    return "High"
# BUG FIX: this cell previously re-ran the total_pymnt bucketing (a copy-paste
# of section 24) and never used the splitLatePayment function defined above.
# Section 29 analyses last_pymnt_amnt, so bucket that column instead, plot the
# distribution, then drop the temporary column to leave the frame unchanged.
lending_club_master['Temp_Last_Payment'] = lending_club_master['last_pymnt_amnt'].apply(splitLatePayment)
pltUniVariableAnalysis('Temp_Last_Payment', lending_club_master)
lending_club_master.drop(columns='Temp_Last_Payment', inplace=True)

30. Data Analysis for collections_12_mths_ex_med

In [500]:
# Drop rows where collections_12_mths_ex_med is null, then verify none remain.
lending_club=lending_club[lending_club.collections_12_mths_ex_med.notnull()]
lending_club['collections_12_mths_ex_med'].isnull().sum()
Out[500]:
0
In [501]:
# Categorical analysis; the output below shows the column is constant (all 0.0),
# so it carries no predictive signal for loan_status.
do_analysis_categorical(lending_club,'collections_12_mths_ex_med')
#lending_club['collections_12_mths_ex_med']
0.0    38137
Name: collections_12_mths_ex_med, dtype: int64
loan_status                 Charged Off  Fully Paid    All  Paid Percentage  \
collections_12_mths_ex_med                                                    
0.0                                5578       32559  38137        85.373784   
All                                5578       32559  38137        85.373784   

loan_status                 Charged Off Percentage  
collections_12_mths_ex_med                          
0.0                                      14.626216  
All                                      14.626216  

31. Data Analysis for int_rate

In [502]:
do_analysis_with_plotly_graphs(lending_club, 'int_rate', 'Interest Rate')
count    38137.000000
mean        11.912342
std          3.677651
min          5.000000
0%           5.000000
10%          7.000000
20%          8.000000
30.0%       10.000000
40%         11.000000
50%         12.000000
60.0%       13.000000
70%         14.000000
80%         15.000000
90%         17.000000
100%        24.000000
max         24.000000
Name: int_rate, dtype: float64
Number of null values for the column int_rate : 0

Insights for int_rate

Borrowers who defaulted usually had a higher interest rate compared to those that did not.

32. Data Analysis for emp_title

Even though the description of the emp_title column says it is the job title supplied by the person taking the loan, it turns out to be the name of the employer.

Lets start by cleaning the data a bit and then looking at it.

To clean the data the process will be to see the different ways the loan applicants have spelled the different employers and fix those. This will be an iterative process.

Data cleanup for emp_title

In [503]:
# Normalise employer-name variants so they aggregate under a single label.
# FIX: Series.str.replace treats the pattern as a regex by default, so patterns
# containing '.' (matches any character) did not replace the literal text;
# regex=False makes every replacement a plain substring substitution.
lending_club.emp_title = lending_club.emp_title.str.lower()
lending_club.emp_title = lending_club.emp_title.str.strip()
lending_club.emp_title = lending_club.emp_title.str.replace('united states army', 'us army', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('u.s. army', 'us army', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('self-employed', 'self employed', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('self -employed', 'self employed', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('self employed', 'self', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('wells fargo bank', 'wells fargo', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('jpmorgan chase', 'jp morgan chase', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united states postal service', 'usps', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('us air force', 'usaf', regex=False)
In [504]:
# Continue employer-name normalisation (regex=False for literal replacement —
# '(dod)' would otherwise be parsed as a regex capture group and never match).
# FIX: the data uses the American spelling 'defense' (the top-50 employer list
# shows 'department of defense'), so the original 'defence' patterns never matched.
lending_club.emp_title = lending_club.emp_title.str.replace('united states air force', 'usaf', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united states navy', 'us navy', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('us postal service', 'usps', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united parcel service', 'ups', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('wal-mart', 'walmart', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('department of defense (dod)', 'department of defense', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('department of defense, usaf', 'department of defense', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('department of defense oig', 'department of defense', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('department of defense - nawctsd', 'department of defense', regex=False)
lending_club.emp_title = lending_club.emp_title.str.replace('united states marine corps', 'usmc', regex=False)
In [505]:
# Top 50 employers by number of loan records (count of non-null 'id' per title).
employers = (lending_club
             .groupby('emp_title')
             .count()
             .sort_values(by='id', ascending=False))
employers['id'].head(50)
Out[505]:
emp_title
us army                            285
usaf                               157
self                               148
usps                               138
bank of america                    133
wells fargo                        108
walmart                            103
ups                                 96
jp morgan chase                     83
at&t                                81
us navy                             69
kaiser permanente                   68
ibm                                 65
verizon wireless                    65
walgreens                           51
lockheed martin                     46
home depot                          44
department of defense               43
state of california                 43
target                              42
verizon                             42
usmc                                39
comcast                             36
booz allen hamilton                 36
northrop grumman                    35
best buy                            35
accenture                           33
retired                             33
fidelity investments                31
morgan stanley                      28
department of homeland security     27
social security administration      27
general electric                    26
citigroup                           25
department of veterans affairs      25
merrill lynch                       24
sprint                              23
the home depot                      23
raytheon                            23
us government                       23
internal revenue service            22
military                            22
columbia university                 22
american express                    21
us bank                             21
pricewaterhousecoopers              21
costco wholesale                    20
adp                                 19
american airlines                   19
bae systems                         19
Name: id, dtype: int64

Insights from emp_title

Looking at this data it is quite clear that people who work for the government — be it the military, the postal service, or other government agencies — seem to be taking the largest number of loans.

33. Analysis of verification_status

In [506]:
do_analysis_categorical(lending_club, 'verification_status')
Not Verified       16548
Verified           12032
Source Verified     9557
Name: verification_status, dtype: int64
loan_status          Charged Off  Fully Paid    All  Paid Percentage  \
verification_status                                                    
Not Verified                2124       14424  16548        87.164612   
Source Verified             1422        8135   9557        85.120854   
Verified                    2032       10000  12032        83.111702   
All                         5578       32559  38137        85.373784   

loan_status          Charged Off Percentage  
verification_status                          
Not Verified                      12.835388  
Source Verified                   14.879146  
Verified                          16.888298  
All                               14.626216  

Insights on verification_status

From the above data and chart it is very clear that verified users are more likely not to pay back their loans.

34. Analysis of delinq_2yrs

In [507]:
do_analysis_categorical(lending_club, 'delinq_2yrs')
0     34020
1      3155
2       657
3       208
4        58
5        21
6        10
7         4
8         2
11        1
9         1
Name: delinq_2yrs, dtype: int64
loan_status  Charged Off  Fully Paid    All  Paid Percentage  \
delinq_2yrs                                                    
0                 4893.0     29127.0  34020        85.617284   
1                  503.0      2652.0   3155        84.057052   
2                  128.0       529.0    657        80.517504   
3                   35.0       173.0    208        83.173077   
4                   13.0        45.0     58        77.586207   
5                    3.0        18.0     21        85.714286   
6                    1.0         9.0     10        90.000000   
7                    1.0         3.0      4        75.000000   
8                    1.0         1.0      2        50.000000   
9                    NaN         1.0      1       100.000000   
11                   NaN         1.0      1       100.000000   
All               5578.0     32559.0  38137        85.373784   

loan_status  Charged Off Percentage  
delinq_2yrs                          
0                         14.382716  
1                         15.942948  
2                         19.482496  
3                         16.826923  
4                         22.413793  
5                         14.285714  
6                         10.000000  
7                         25.000000  
8                         50.000000  
9                          0.000000  
11                         0.000000  
All                       14.626216  

Insights from delinq_2yrs

From the above data it is clear that users that have been delinquent in the last 2 years are more likely not to pay back their loan.

Data Cleanup of revol_util

In [508]:
# Strip the '%' suffix and convert revol_util to a numeric column; keep a
# NaN-free copy for the distribution analysis below.
lending_club['revol_util'] = pd.to_numeric(lending_club['revol_util'].str.replace('%', ''))
lending_club_revol_util = lending_club.dropna(subset=['revol_util'])

35. Analysis of revol_util

In [509]:
do_analysis_with_plotly_graphs(lending_club_revol_util, 'revol_util', 'revoling utilization')
count    38088.000000
mean        48.718465
std         28.331883
min          0.000000
0%           0.000000
10%          8.500000
20%         20.000000
30.0%       30.300000
40%         39.900000
50%         49.100000
60.0%       58.200000
70%         67.500000
80%         77.200000
90%         87.800000
100%        99.900000
max         99.900000
Name: revol_util, dtype: float64
Number of null values for the column revol_util : 0

Insights of revol_util

What is very interesting here is that the distribution of charged-off loans increases after the utilization crosses 50%, and it really picks up after 70%.

36. Analysis of pub_rec_bankruptcies

In [510]:
do_analysis_categorical(lending_club, 'pub_rec_bankruptcies')
0.0    35867
1.0     1631
2.0        5
Name: pub_rec_bankruptcies, dtype: int64
loan_status           Charged Off  Fully Paid    All  Paid Percentage  \
pub_rec_bankruptcies                                                    
0.0                          5101       30766  35867        85.778013   
1.0                           364        1267   1631        77.682403   
2.0                             2           3      5        60.000000   
All                          5467       32036  37503        85.422500   

loan_status           Charged Off Percentage  
pub_rec_bankruptcies                          
0.0                                14.221987  
1.0                                22.317597  
2.0                                40.000000  
All                                14.577500  

Insights from pub_rec_bankruptcies

Users with a public record of a bankruptcies are more likely not to pay their loans.

---- SECTION 4 (Correlation analysis, Bivariate Analysis, Derived Variables) ---

1) Income Level

In [511]:
# Segment customers by annual income:
# LOW is below 30,000; MID is 30,000 up to (but not including) 90,000;
# HIGH is 90,000 and above.

def get_income_segment(income):
    """Map an annual income to 'LOW', 'MID' or 'HIGH'.

    Returns None when income is NaN (every comparison is False), matching
    the original behaviour.
    """
    if income < 30000:
        return 'LOW'
    if 30000 <= income < 90000:
        return 'MID'
    if income >= 90000:
        return 'HIGH'
In [512]:
# Segment the customers by income level.
# FIX: apply the mapper to the annual_inc Series directly — the original
# row-wise apply(..., axis=1) materialises every full row and is far slower
# for a single-column transformation; the result is identical.
lending_club['income_level'] = lending_club['annual_inc'].apply(get_income_segment)
In [513]:
# Segmented Univariate Analysis
pd.pivot_table(lending_club, values = 'policy_code', index = ['income_level'], columns = ['loan_status'], aggfunc = np.sum)
Out[513]:
loan_status Charged Off Fully Paid
income_level
HIGH 839 6790
LOW 700 3033
MID 4039 22736
In [514]:
# Segmented Univariate Analysis
pd.pivot_table(lending_club, values = 'policy_code', index = ['loan_status'], columns = ['income_level'], aggfunc = np.sum).plot(kind='pie', subplots=True, figsize=(24, 8))
Out[514]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x000001B2B339EBE0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001B2BCD4CB70>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001B2B397C080>],
      dtype=object)
In [515]:
# Segmented Univariate Analysis. todo create barcharts so that all bars are of same height
pd.pivot_table(lending_club, values = 'policy_code', index = ['income_level'], columns = ['loan_status'], aggfunc = np.sum).plot(kind='bar', stacked=True, figsize=(24, 8))
Out[515]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2b7fd1d30>

INSIGHT : This pivot table gives the percentage of charged off (defaults) in each income segment.

10% : HIGH Income segment have 841/(6808 + 218 + 841)
14% : MID Income Segment have 4043 / (4043 + 789 + 22758)
18% : LOW Income Segment has 700 / (700 + 56 + 3043)
This shows that loan defaults increase in the lower income segments.

What is the distribution of loan amount in the lower income segment when customer defaults

In [516]:
lending_club.loc[(lending_club['income_level'] == 'LOW') & (lending_club['loan_status'] == 'Charged Off')]['loan_amnt'].plot(kind='hist', bins=50)
Out[516]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2b20a70f0>

2) Loan Amount Funded Amount difference

In [517]:
# I also want to analyse if the people who ask more, but get less. 
# Is there is corelation between this difference and chargeoff

lending_club['asking_lending_difference'] = lending_club['loan_amnt'] - lending_club['funded_amnt']
In [518]:
do_analysis(lending_club, 'asking_lending_difference')
count    38137.000000
mean       259.481606
std       1386.222204
min          0.000000
0%           0.000000
10%          0.000000
20%          0.000000
30.0%        0.000000
40%          0.000000
50%          0.000000
60.0%        0.000000
70%          0.000000
80%          0.000000
90%          0.000000
100%     21025.000000
max      21025.000000
Name: asking_lending_difference, dtype: float64
Number of null values for the column asking_lending_difference : 0
In [519]:
# Where asking_lending_difference is greater than 0
lending_club[['asking_lending_difference', 'loan_status']].loc[lending_club['asking_lending_difference'] > 0].boxplot(by='loan_status')
Out[519]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2b20c1f60>

Insight

The charged off loans had asked for a higher amount but got lower loan amount.

In [520]:
run_t_test(lending_club, 'asking_lending_difference')
Ttest_indResult(statistic=5.107858716838854, pvalue=3.2739700637100517e-07)

--- Multivariate analysis using a dendrogram

In [521]:
# Pairwise correlations across the monetary / numeric loan columns.
money_cols = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment',
              'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
              'total_rec_int', 'annual_inc', 'revol_bal', 'last_pymnt_amnt',
              'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'dti']
corr = lending_club[money_cols].corr()
In [522]:
# Heat-map of the correlation matrix.
# FIX: correlation coefficients span [-1, 1]; the original vmin=0 clipped every
# negative correlation to the bottom colour, making anticorrelated pairs
# indistinguishable from uncorrelated ones. vmin=-1 centres coolwarm's neutral
# colour at 0.
fig, ax = plt.subplots(figsize=(15, 15))
ax.matshow(corr, cmap=cm.get_cmap('coolwarm'), vmin=-1, vmax=1)
plt.xticks(range(len(corr.columns)), corr.columns, rotation='vertical', fontsize=16);
plt.yticks(range(len(corr.columns)), corr.columns, fontsize=16);
In [523]:
Z = linkage(corr, 'average')
In [524]:
# Dendrogram of the hierarchically clustered correlation matrix (Z from above).
labelsize = 20
ticksize = 15

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram for loan variables', fontsize=labelsize)
plt.xlabel('variable', fontsize=labelsize)
plt.ylabel('distance', fontsize=labelsize)
dendrogram(Z,
           leaf_rotation=90.,   # rotate the x-axis labels
           leaf_font_size=8.,   # font size for the x-axis labels
           labels=corr.columns)
pylab.yticks(fontsize=ticksize)
pylab.xticks(rotation=-90, fontsize=ticksize)
plt.show()
In [525]:
# Scatter of loan amount vs annual income, coloured by loan outcome.
status_colours = {'Fully Paid': 'green', 'Charged Off': 'red', 'Current': 'blue'}

fig, ax = plt.subplots(figsize=(17, 17))
point_colours = lending_club['loan_status'].apply(lambda status: status_colours[status])
ax.scatter(lending_club['loan_amnt'], lending_club['annual_inc'], c=point_colours, alpha=0.5)
Out[525]:
<matplotlib.collections.PathCollection at 0x1b2b38cce10>

Insight

Plot to see if there is a co-relation between the loan amount and annual income, and see if there are any clusters
of defaults. We see that the loan amount is co-related to annual income, however there is no pattern or clusters of
defaults
At lower income levels - below 10000 USD and higher loan amounts - above 15, 000 USD, there seems to be a concentration of defaults.

3) Analyse Grade, SubGrade, Interest Rate with Respect to Other Columns (BiVariate)

In [526]:
#Grade, Sub-Grade to Loan Percentage, As per the Graph, Upper Grade is Having Higher Interest Rate
lending_club.groupby(['grade', 'sub_grade'])['int_rate'].agg(['mean']).sort_values(by='mean').plot(kind='bar', figsize=(18,7))
Out[526]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b2ad096f60>

3.1 Derived Columns (Grade_SubGragde, LateFeePrecent)

In [527]:
#Create a Single Column by Merging grade and subgrade
lending_club['grade_subgrade'] = lending_club['grade'] + "_" + lending_club['sub_grade']
lending_club['grade_subgrade'].nunique()
Out[527]:
35
In [528]:
# Derived column: late fee as a percentage of the total payment, rounded to 2 dp.
# NOTE(review): a row with total_pymnt == 0 would yield inf here — confirm the
# data guarantees at least one payment per loan before relying on this column.
lending_club['lateFeePrecent'] = round((lending_club['total_rec_late_fee']/lending_club['total_pymnt'])*100,2)
In [529]:
# Mean late-fee percentage per loan outcome, shown as a bar chart.
X1 = lending_club.groupby('loan_status').lateFeePrecent.agg(['mean']).reset_index()

plt.figure(figsize=(8, 8))
sns.barplot(x='loan_status', y='mean', data=X1)
plt.xlabel('Loan Status', fontsize = 12)
plt.ylabel('Late Fee Perecent', fontsize = 12)
plt.show()

Insight

  • Late Fee Percentage is higher for Charged Off subscribers than for Fully Paid subscribers
In [530]:
# Attach the average interest rate of each grade_subgrade group as a new column.
# FIX: groupby().transform('mean') broadcasts the group mean back onto every row
# while preserving row order and row count; the original inner merge re-sorted
# the whole frame and would silently drop rows on any key mismatch.
lending_club['avgIntRate'] = lending_club.groupby('grade_subgrade')['int_rate'].transform('mean')
In [531]:
# Box plot of the group-average interest rate by loan outcome.
# FIX: pass x/y as keywords — positional data arguments to seaborn plotting
# functions were deprecated in seaborn 0.12 and removed later.
plt.figure(figsize=(8,8))
sns.boxplot(x='loan_status', y='avgIntRate', data=lending_club)
plt.xlabel('Loan Status', fontsize = 12)
plt.ylabel('Average Interest Rate', fontsize = 12)
plt.show()

Insight

  • Average Interest Rate based on Grade/SubGrade is higher for Charged Off subscribers than for Fully Paid subscribers
In [532]:
#Grade impact on LateFee Precentage
X1 = createDF_LateFeePrecent_Categories('grade')
X1.head()
Out[532]:
loan_status grade Charged Off Fully Paid LateFeePercent_diff
0 A 0.164210 0.006156 0.158054
2 C 0.166059 0.015965 0.150094
1 B 0.137014 0.011589 0.125425
3 D 0.134229 0.014664 0.119565
6 G 0.095354 0.015579 0.079775
In [533]:
#Plot Grade impact on LateFee Precentage based on LoanCategory
pltLateFeePercentOnCategies('grade',X1)

Insight

  • In following top 3 category(grade) where ChargedOff Suscriber give more latefee than Fullypaid subscriber
    • A
    • C
    • B
In [534]:
#grade_subgrade impact on LateFee Precentage
X1 = createDF_LateFeePrecent_Categories('grade_subgrade')
X1.head()
Out[534]:
loan_status grade_subgrade Charged Off Fully Paid LateFeePercent_diff
6 B_B2 0.287511 0.010919 0.276592
3 A_A4 0.221798 0.006780 0.215018
14 C_C5 0.232560 0.019527 0.213034
0 A_A1 0.207000 0.006921 0.200079
13 C_C4 0.202153 0.017454 0.184699
In [535]:
#Plot grade_subgrade impact on LateFee Precentage based on LoanCategory
pltLateFeePercentOnCategies('grade_subgrade',X1)

Insight

  • In following top 3 category(grade_subgrade) where ChargedOff Suscriber give more latefee than Fullypaid subscriber
    • B_B2
    • A_A4
    • C_C5

4) Analyse "Home Ownership" with respect to Other Columns (BiVariate)

In [536]:
#home_ownership impact on LateFee Precentage
X1 = createDF_LateFeePrecent_Categories('home_ownership')
X1.head()
Out[536]:
loan_status home_ownership Charged Off Fully Paid LateFeePercent_diff
2 OWN 0.183790 0.018029 0.165761
3 RENT 0.151760 0.014262 0.137499
0 MORTGAGE 0.110798 0.006947 0.103851
1 OTHER 0.034444 0.020513 0.013932
In [537]:
#Plot home_ownership impact on LateFee Precentage based on LoanCategory
pltLateFeePercentOnCategies('home_ownership',X1)

Insight

  • In following top 3 category (Home_Ownership) where ChargedOff Suscriber give more latefee than Fullypaid subscriber
    • Own
    • Rent
    • Mortgage

5) Analyse "purpose" with respect to Other Columns (BiVariate)

In [538]:
#purpose impact on LateFee Precentage
X1 = createDF_LateFeePrecent_Categories('purpose')
X1.head()
Out[538]:
loan_status purpose Charged Off Fully Paid LateFeePercent_diff
12 vacation 0.258113 0.028687 0.229426
9 other 0.247818 0.021148 0.226671
7 medical 0.239623 0.020000 0.219623
6 major_purchase 0.225727 0.010618 0.215109
11 small_business 0.173582 0.012879 0.160703
In [539]:
#Plot purpose impact on LateFee Precentage based on LoanCategory
pltLateFeePercentOnCategies('purpose',X1)

Insight

  • In following top 3 category(Purpose) where ChargedOff Suscriber give more latefee than Fullypaid subscriber
    • Vacation
    • Other
    • Medical

6) Analyse "addr_state" with respect to Other Columns (BiVariate)

In [540]:
#addr_state impact on LateFee Precentage
X1 = createDF_LateFeePrecent_Categories('addr_state')
X1.head()
Out[540]:
loan_status addr_state Charged Off Fully Paid LateFeePercent_diff
31 NM 0.36800 0.004295 0.363705
35 OK 0.30600 0.009959 0.296041
41 TN 0.39000 0.109333 0.280667
8 DE 0.27250 0.013267 0.259233
27 NC 0.26115 0.005780 0.255370
In [541]:
#Plot purpose impact on LateFee Precentage based on LoanCategory
pltLateFeePercentOnCategies('addr_state',X1)

Insight

  • In following top 3 category(addr_state) where ChargedOff Suscriber give more latefee than Fullypaid subscriber
    • NM
    • OK
    • TN

7) Analyse "inq_last_6mths" with respect to Other Columns (BiVariate)

In [542]:
#inq_last_6mths impact on LateFee Precentage
X1 = createDF_LateFeePrecent_Categories('inq_last_6mths')
X1.head()
Out[542]:
loan_status inq_last_6mths Charged Off Fully Paid LateFeePercent_diff
7 7 0.312000 0.053478 0.258522
3 3 0.193931 0.015517 0.178414
0 0 0.137266 0.009927 0.127339
5 5 0.123704 0.005789 0.117914
1 1 0.124882 0.012517 0.112365
In [543]:
#Plot purpose impact on LateFee Precentage based on LoanCategory
pltLateFeePercentOnCategies('inq_last_6mths',X1)

Insight

  • In following top 3 category(inq_last_6mths) where ChargedOff Suscriber give more latefee than Fullypaid subscriber
    • 7 (Last 6 Month Inquary Count)
    • 3 (Last 6 Month Inquary Count)
    • 0 (Last 6 Month Inquary Count)

8) Analyse "pub_rec" with respect to Other Columns (BiVariate)

In [544]:
#pub_rec impact on LateFee Precentage
X1 = createDF_LateFeePrecent_Categories('pub_rec')
X1.head()
Out[544]:
loan_status pub_rec Charged Off Fully Paid LateFeePercent_diff
0 0 0.142327 0.011198 0.131129
1 1 0.079268 0.013628 0.065640
2 2 0.028000 0.015405 0.012595
3 3 NaN 0.041429 NaN
4 4 NaN 0.000000 NaN
In [545]:
#Plot purpose impact on LateFee Precentage based on LoanCategory
pltLateFeePercentOnCategies('pub_rec',X1)

Insight

  • In following top 3 category(pub_rec) where ChargedOff Suscriber give more latefee than Fullypaid subscriber
    • 0 (Pub Rec)
    • 1 (Pub Rec)
    • 2 (Pub Rec)
  • Charged-Off Susbcriber information are not present mostly in pub-rec

-------- Section:5 Logistic Regression for Stastical Inference -------

In [546]:
def convert_loan_status_to_binary(loan_status):
    """Encode loan_status for the logit model: 1 = 'Charged Off', 0 = 'Fully Paid'.

    Any other status returns None — presumably such rows are excluded from the
    regression downstream (TODO confirm; the fitted model reports fewer
    observations than the frame has rows).
    """
    # FIX: removed a stray trailing semicolon and made the fall-through explicit.
    if loan_status == 'Charged Off':
        return 1
    if loan_status == 'Fully Paid':
        return 0
    return None
    
lending_club['loan_status_binary'] = lending_club['loan_status'].apply(lambda x:convert_loan_status_to_binary(x))
In [547]:
# Normalize the columns so that they lie between 0 and 1
# This will help ensure that the coefficients of logistic regression follow the same scale for comparison
# Min-max scale the numeric model features to [0, 1] so the logistic-regression
# coefficients are on a comparable scale.
lending_club_normalized = lending_club.copy()
norm_features = ['annual_inc', 'loan_amnt', 'dti', 'revol_bal', 'delinq_2yrs',
                 'pub_rec_bankruptcies', 'funded_amnt', 'open_acc']
for feature_name in norm_features:
    feature = lending_club[feature_name]
    lending_club_normalized[feature_name] = (feature - feature.min()) / (feature.max() - feature.min())
In [548]:
#int_rate + revol_util
lreg = smf.logit(formula ='loan_status_binary ~ annual_inc + loan_amnt  + dti + revol_bal + C(income_level) + delinq_2yrs +  pub_rec_bankruptcies + funded_amnt + C(term)  + open_acc', data = lending_club_normalized).fit()
Optimization terminated successfully.
         Current function value: 0.394756
         Iterations 6
In [549]:
print(lreg.summary2())
                            Results: Logit
=======================================================================
Model:                Logit               No. Iterations:    6.0000    
Dependent Variable:   loan_status_binary  Pseudo R-squared:  0.049     
Date:                 2018-07-29 17:22    AIC:               29633.0648
No. Observations:     37503               BIC:               29735.4509
Df Model:             11                  Log-Likelihood:    -14805.   
Df Residuals:         37491               LL-Null:           -15575.   
Converged:            1.0000              Scale:             1.0000    
-----------------------------------------------------------------------
                        Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-----------------------------------------------------------------------
Intercept              -1.9604   0.1116 -17.5587 0.0000 -2.1792 -1.7416
C(income_level)[T.LOW]  0.2448   0.1005   2.4360 0.0148  0.0478  0.4418
C(income_level)[T.MID]  0.0148   0.0683   0.2172 0.8281 -0.1190  0.1487
C(term)[T. 60 months]   0.9769   0.0334  29.2578 0.0000  0.9114  1.0423
annual_inc             -2.0937   0.2185  -9.5824 0.0000 -2.5219 -1.6654
loan_amnt               0.1915   0.3394   0.5643 0.5725 -0.4737  0.8568
dti                     0.1828   0.0750   2.4376 0.0148  0.0358  0.3298
revol_bal               0.9879   0.1733   5.6994 0.0000  0.6482  1.3276
delinq_2yrs             1.5692   0.3097   5.0661 0.0000  0.9621  2.1763
pub_rec_bankruptcies    1.1315   0.1256   9.0090 0.0000  0.8853  1.3776
funded_amnt             0.4161   0.3484   1.1944 0.2323 -0.2667  1.0990
open_acc               -0.2069   0.1617  -1.2798 0.2006 -0.5238  0.1100
=======================================================================

In [550]:
print("Sorted odds ratios of each variable")
print((np.exp(lreg.params)).sort_values(ascending=False))
Sorted odds ratios of each variable
delinq_2yrs               4.802949
pub_rec_bankruptcies      3.100253
revol_bal                 2.685587
C(term)[T. 60 months]     2.656181
funded_amnt               1.516077
C(income_level)[T.LOW]    1.277359
loan_amnt                 1.211116
dti                       1.200563
C(income_level)[T.MID]    1.014943
open_acc                  0.813090
Intercept                 0.140805
annual_inc                0.123233
dtype: float64
In [551]:
print("lower and upper confidence intervals")
print(lreg.conf_int())
lower and upper confidence intervals
                               0         1
Intercept              -2.179208 -1.741558
C(income_level)[T.LOW]  0.047838  0.441751
C(income_level)[T.MID] -0.119014  0.148679
C(term)[T. 60 months]   0.911448  1.042331
annual_inc             -2.521912 -1.665439
loan_amnt              -0.473689  0.856774
dti                     0.035815  0.329766
revol_bal               0.648169  1.327631
delinq_2yrs             0.962134  2.176326
pub_rec_bankruptcies    0.885323  1.377645
funded_amnt            -0.266746  1.098998
open_acc               -0.523800  0.109974

--------- Section:6 Most Affected Columns & Univariate Analysis-------------

1. delinq_2yrs

In [552]:
categorical_Percentage_Distribution('delinq_2yrs', '% Distribution of delinq_2yrs and ChargedOff Subscriber% on Total')
   delinq_2yrs  count  GroupPercentage  GroupwiseChargedOffCount  \
0            0  34020         0.892047                      4893   
1            1   3155         0.082728                       503   
2            2    657         0.017227                       128   
3            3    208         0.005454                        35   
4            4     58         0.001521                        13   
5            5     21         0.000551                         3   
6            6     10         0.000262                         1   
7            7      4         0.000105                         1   
8            8      2         0.000052                         1   

   ChargedOffPercent  
0           0.143827  
1           0.159429  
2           0.194825  
3           0.168269  
4           0.224138  
5           0.142857  
6           0.100000  
7           0.250000  
8           0.500000  

Insight

  • The higher the value of "delinq_2yrs", the higher the chance of the loan being Charged Off

2. pub_rec_bankruptcies

In [553]:
categorical_Percentage_Distribution('pub_rec_bankruptcies', 'Distribution of pub_rec_bankruptcies and ChargedOff Subscriber% on Total')
   pub_rec_bankruptcies  count  GroupPercentage  GroupwiseChargedOffCount  \
0                   0.0  35867         0.956377                      5101   
1                   1.0   1631         0.043490                       364   
2                   2.0      5         0.000133                         2   

   ChargedOffPercent  
0           0.142220  
1           0.223176  
2           0.400000  

Insight

  • Higer the Value of "pub_rec" Higher the Chances of Charged Off </b>

3. term

In [554]:
categorical_Percentage_Distribution('term', 'Distribution of term column and ChargedOff Subscriber% on Total')
         term  count  GroupPercentage  GroupwiseChargedOffCount  \
0   36 months  28764         0.754228                      3195   
1   60 months   9373         0.245772                      2383   

   ChargedOffPercent  
0           0.111076  
1           0.254241  

Insight

  • The longer the loan "term", the higher the chance of the loan being Charged Off

4. pub_rec_bankruptcies

In [555]:
categorical_Percentage_Distribution('pub_rec_bankruptcies', 'Distribution of term column and ChargedOff Subscriber on Total')
   pub_rec_bankruptcies  count  GroupPercentage  GroupwiseChargedOffCount  \
0                   0.0  35867         0.956377                      5101   
1                   1.0   1631         0.043490                       364   
2                   2.0      5         0.000133                         2   

   ChargedOffPercent  
0           0.142220  
1           0.223176  
2           0.400000  

Insight

  • Higer the Value of "pub_rec" Higher the Chances of Charged Off </b>

5. revol_bal

In [556]:
# Distribution of the revolving balance: 5%-step percentiles plus a density plot.
print(lending_club['revol_bal'].describe(percentiles = np.arange(0.0,1.0,0.05)))
plt.figure(figsize=(15,7))
# NOTE(review): sns.distplot is deprecated — consider sns.histplot(..., kde=True)
# or sns.displot when upgrading seaborn.
sns.distplot(lending_club['revol_bal'])
plt.show()
count     38137.000000
mean      13076.611322
std       15422.583623
min           0.000000
0%            0.000000
5%          311.000000
10%        1092.000000
15.0%      1938.400000
20%        2772.000000
25%        3634.000000
30.0%      4545.000000
35%        5482.000000
40%        6491.000000
45%        7561.200000
50%        8710.000000
55.0%      9946.800000
60.0%     11319.200000
65%       12898.000000
70%       14605.200000
75%       16741.000000
80%       19395.400000
85.0%     23163.400000
90%       28536.600000
95%       40276.000000
max      149588.000000
Name: revol_bal, dtype: float64
In [557]:
# Bucket revol_bal by its quartiles: Qtr1 <= 3634 (25th pct), Qtr2 <= 8710
# (50th pct), Qtr3 <= 16741 (75th pct), Qtr4 above that.
def splitRevolBal2Category(x):
    """Return the revolving-balance quartile label for a value.

    NaN fails every <= comparison and falls through to 'Qtr4', matching the
    original if/elif/else behaviour.
    """
    if x <= 3634:
        return 'Qtr1'
    if x <= 8710:
        return 'Qtr2'
    if x <= 16741:
        return 'Qtr3'
    return 'Qtr4'
# Derived column: quartile bucket of revol_bal (Series.map is equivalent to apply for a plain function).
lending_club['Revol_Bal_Category'] = lending_club['revol_bal'].map(splitRevolBal2Category)
In [558]:
# Revol_Bal quartile vs. charge-off: each bucket's share of borrowers and its charged-off rate.
categorical_Percentage_Distribution('Revol_Bal_Category', 'Distribution of Revol_Bal_Category column and ChargedOff Subscriber% on Total')
  Revol_Bal_Category  count  GroupPercentage  GroupwiseChargedOffCount  \
0               Qtr1   9537         0.250072                      1291   
1               Qtr2   9532         0.249941                      1386   
2               Qtr3   9535         0.250020                      1430   
3               Qtr4   9533         0.249967                      1471   

   ChargedOffPercent  
0           0.135368  
1           0.145405  
2           0.149974  
3           0.154306  

Insight

  • Higher the value of "revol_bal", the higher the chance of Charged Off

6. Annual_Inc

In [559]:
# Percentile summary (5% steps) and distribution plot for annual income.
annual_inc = lending_club['annual_inc']
print(annual_inc.describe(percentiles=np.arange(0.0, 1.0, 0.05)))
plt.figure(figsize=(15, 7))
sns.distplot(annual_inc)
plt.show()
count     38137.000000
mean      65297.539396
std       35121.321372
min        4000.000000
0%         4000.000000
5%        24000.000000
10%       30000.000000
15.0%     33777.800000
20%       37000.000000
25%       40000.000000
30.0%     44000.000000
35%       48000.000000
40%       50000.000000
45%       54000.000000
50%       58000.000000
55.0%     60500.000000
60.0%     65000.000000
65%       70000.000000
70%       75000.000000
75%       80140.000000
80%       90000.000000
85.0%     98368.000000
90%      110748.800000
95%      135000.000000
max      234996.000000
Name: annual_inc, dtype: float64
In [560]:
# Bucket annual_inc into 4 quartile categories using the percentile cut points printed above:
# Qtr1 <= 40000 (25%), Qtr2 <= 58000 (50%), Qtr3 <= 80140 (75%), Qtr4 above 80140.
def splitAnnualIncome2Category(x):
    """Return the quartile bucket label ('Qtr1'..'Qtr4') for an annual income value."""
    # Chained elif makes the lower bounds implicit — no need for the
    # bitwise-& scalar comparisons the original used.
    if x <= 40000:
        return 'Qtr1'
    elif x <= 58000:
        return 'Qtr2'
    elif x <= 80140:
        return 'Qtr3'
    else:
        return 'Qtr4'
# Derived column: quartile bucket of annual_inc (Series.map is equivalent to apply for a plain function).
lending_club['annual_inc_Category'] = lending_club['annual_inc'].map(splitAnnualIncome2Category)
In [561]:
# Annual-income quartile vs. charge-off: each bucket's share of borrowers and its charged-off rate.
categorical_Percentage_Distribution('annual_inc_Category', 'Distribution of annual_inc_Category column and ChargedOff Subscriber% on Total')
  annual_inc_Category  count  GroupPercentage  GroupwiseChargedOffCount  \
0                Qtr1   9680         0.253822                      1748   
1                Qtr2   9495         0.248971                      1420   
2                Qtr3   9428         0.247214                      1339   
3                Qtr4   9534         0.249993                      1071   

   ChargedOffPercent  
0           0.180579  
1           0.149552  
2           0.142024  
3           0.112335  

Insight

  • Lower the value of "annual_inc", the higher the chance of Charged Off

7. dti

In [562]:
# Percentile summary (5% steps) and distribution plot for the debt-to-income ratio.
dti = lending_club['dti']
print(dti.describe(percentiles=np.arange(0.0, 1.0, 0.05)))
plt.figure(figsize=(15, 7))
sns.distplot(dti)
plt.show()
count    38137.000000
mean        13.340740
std          6.655889
min          0.000000
0%           0.000000
5%           2.180000
10%          4.140000
15.0%        5.640000
20%          7.030000
25%          8.220000
30.0%        9.340000
35%         10.400000
40%         11.420000
45%         12.480000
50%         13.440000
55.0%       14.400000
60.0%       15.380000
65%         16.420000
70%         17.500000
75%         18.600000
80%         19.750000
85.0%       20.960000
90%         22.320000
95%         23.820000
max         29.990000
Name: dti, dtype: float64
In [563]:
# Bucket dti into 4 categories at rounded cut points 8 / 13 / 17.
# NOTE(review): the printed quartiles above are ~8.22 / 13.44 / 18.60, so these
# buckets are approximations of the quartiles, not exact ones (kept as-is to
# preserve the original analysis).
def splitDtiCategory(x):
    """Return the bucket label ('Qtr1'..'Qtr4') for a debt-to-income value."""
    # Chained elif makes the lower bounds implicit — no need for the
    # bitwise-& scalar comparisons the original used.
    if x <= 8:
        return 'Qtr1'
    elif x <= 13:
        return 'Qtr2'
    elif x <= 17:
        return 'Qtr3'
    else:
        return 'Qtr4'
# Derived column: bucket of dti (Series.map is equivalent to apply for a plain function).
lending_club['dti_Category'] = lending_club['dti'].map(splitDtiCategory)
In [564]:
# DTI bucket vs. charge-off: each bucket's share of borrowers and its charged-off rate.
categorical_Percentage_Distribution('dti_Category', 'Distribution of dti_Category column and ChargedOff Subscriber% on Total')
  dti_Category  count  GroupPercentage  GroupwiseChargedOffCount  \
0         Qtr1   9178         0.240659                      1133   
1         Qtr2   9047         0.237224                      1254   
2         Qtr3   7637         0.200252                      1166   
3         Qtr4  12275         0.321866                      2025   

   ChargedOffPercent  
0           0.123447  
1           0.138609  
2           0.152678  
3           0.164969  

Insight

  • Higher the value of "dti", the higher the chance of Charged Off

8. Grade

In [565]:
# Loan grade vs. charge-off: each grade's share of borrowers and its charged-off rate (A best, G worst).
categorical_Percentage_Distribution('grade', 'Distribution of grade column and ChargedOff Subscriber% on Total')
  grade  count  GroupPercentage  GroupwiseChargedOffCount  ChargedOffPercent
0     A   9956         0.261059                       601           0.060366
1     B  11568         0.303327                      1413           0.122147
2     C   7741         0.202979                      1340           0.173104
3     D   5024         0.131736                      1111           0.221139
4     E   2607         0.068359                       703           0.269659
5     F    952         0.024963                       311           0.326681
6     G    289         0.007578                        99           0.342561

Insight

  • Higher (riskier) the loan "grade", the higher the chance of Charged Off